

import pytesseract
from PIL import Image

def getStrByImg(_img):
    """Run OCR on an image and return the recognised text.

    :param _img: value handed to ``PIL.Image.open`` to load the image
    :return: text recognised by Tesseract using the ``chi_sim`` language data
    """
    # Open the image in a context manager so the underlying file is released
    # as soon as recognition finishes instead of whenever the object is
    # garbage collected.
    with Image.open(_img) as im:
        # Recognise the text with the simplified-Chinese language pack.
        return pytesseract.image_to_string(im, lang='chi_sim')

import cv2
import pytesseract
from pytesseract import Output
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import numpy as np
import re

from difflib import SequenceMatcher

def similarity(s1, s2):
    """
        Compare two strings and return their similarity ratio.

        The value is ``difflib.SequenceMatcher.ratio()``: a float in [0, 1],
        where 1.0 means the sequences are identical. The original note
        mentions ``similarity_ratio >= 0.8``; no threshold is applied here,
        so any cut-off is left to the caller.
    """
    matcher = SequenceMatcher(a=s1, b=s2)
    return matcher.ratio()
def split_num_content(str):
    """Strip the leading question number from a question line.

    Everything up to and including the first ``/`` is removed (the whole
    string is kept when there is no ``/``), then the run of digits that
    follows is removed as well. Example: ``'1/88 some text'`` becomes
    ``' some text'``.

    :param str: text of the question line (the name shadows the builtin but
                is kept so existing callers keep working)
    :return: the text after the question number; ``''`` when nothing but
             digits follows the slash
    """
    remainder = str[str.find('/') + 1:]
    for index, char in enumerate(remainder):
        if not char.isdigit():
            return remainder[index:]
    # Only digits (or nothing) left: the whole remainder was the number.
    # The previous implementation returned the digits unstripped here.
    return ''
def filter_select_item(str):
    """Tell whether a line starts with an option letter (A, B, C or D).

    :param str: text line to inspect (the name shadows the builtin but is
                kept so existing callers keep working)
    :return: True when the first character is 'A', 'B', 'C' or 'D';
             False otherwise, including for an empty string
    """
    # startswith() with a tuple checks all four letters in one call and,
    # unlike indexing with [0], does not raise IndexError on an empty string.
    return str.startswith(('A', 'B', 'C', 'D'))
def filter_select_list(list):
    """Merge wrapped lines into the entry they belong to, in place.

    A line that does not start with an option letter (see
    ``filter_select_item``) is treated as a continuation and appended to the
    previous entry; the first entry is always kept as it is.

    When ``check_question_type`` returns 1 this merging is applied to the
    whole list. Otherwise (true/false question) entries equal to ``' '`` are
    removed, continuation lines are merged only until the first option line
    has been seen, and after that the first non-option line and everything
    following it are deleted.

    :param list: text lines of one question; modified in place
    :return: None
    """
    index = 0

    if check_question_type(list) == 1:
        while index < len(list):
            if filter_select_item(list[index]) or index == 0:
                index += 1
            else:
                # Replace the pair (previous, current) with their concatenation.
                list[index - 1:index + 1] = [list[index - 1] + list[index]]
        return

    # True/false question.
    merging_topic = True  # still inside the question text, no option seen yet
    while index < len(list):
        entry = list[index]
        if entry == ' ':
            del list[index]
        elif filter_select_item(entry):
            merging_topic = False
            index += 1
        elif not merging_topic:
            # Non-option text after the options: drop it and all that follows.
            del list[index:]
        elif index == 0:
            index += 1
        else:
            list[index - 1:index + 1] = [list[index - 1] + entry]
    
def check_question_type(list):
    """Classify a question by whether any of its lines starts with 'D'.

    :param list: text lines of one question (the name shadows the builtin
                 but is kept so existing callers keep working)
    :return: 1 when some line starts with 'D' (a question with a D option),
             0 otherwise (handled by the caller as a true/false question)
    """
    # startswith() is safe for empty lines, where item[0] raised IndexError.
    return 1 if any(item.startswith('D') for item in list) else 0

def find_question_result(list):
    """Extract the answer letter from the "正确答案:" line.

    Only lines that start with "正确答案:" are considered, and only their last
    character is inspected. A trailing '8' is reported as 'B' (presumably an
    OCR misreading of the letter B).

    :param list: text lines of one question
    :return: 'A' or 'B' from the first answer line ending in a known
             character, or '' when no such line exists
    """
    answer_for_last_char = {'A': 'A', 'B': 'B', '8': 'B'}
    for line in list:
        if not line.startswith("正确答案:"):
            continue
        answer = answer_for_last_char.get(line[-1:])
        if answer is not None:
            return answer
    return ''

def _commit_line(line, lines, started, pattern):
    """Store one finished OCR line and return the updated "started" flag.

    Lines are kept only from the first one matching ``pattern`` onwards;
    empty lines are ignored.
    """
    if not line:
        return started
    if re.findall(pattern, line):
        started = True
    if started:
        lines.append(line)
    return started


def recoText(im):
    """
    Run OCR on a question screenshot and return its cleaned-up text lines.

    Recognised words are concatenated into lines; an empty OCR entry ends the
    current line. Everything before the first line containing "第N题" is
    discarded. That header line is then dropped, the question number is
    stripped from the line after it, the detected answer is printed and the
    remaining lines are merged/trimmed by ``filter_select_list``.

    Each recognised word is also drawn (text plus red box) on a PIL copy of
    the image; that copy is only useful for the optional debug preview.

    :param im: image to recognise, as an array accepted by
               ``Image.fromarray`` and ``pytesseract.image_to_data``
    :return: list of processed text lines; an empty list when fewer than two
             lines (header plus question text) were recognised
    """
    started = False  # True once the "第N题" header line has been seen
    linlist = []
    current = ""
    pattern = r'第(\d+)题'
    font = None  # loaded once, on the first recognised word

    pilimg = Image.fromarray(im)
    draw = ImageDraw.Draw(pilimg)
    d = pytesseract.image_to_data(im, output_type=Output.DICT, lang='chi_sim')
    for i, text in enumerate(d['text']):
        if not text:
            # An empty entry separates lines: commit what was collected.
            started = _commit_line(current, linlist, started, pattern)
            current = ""
            continue

        current += text
        print(text, end="")

        x, y, w, h = d['left'][i], d['top'][i], d['width'][i], d['height'][i]
        # cv2.putText cannot render Chinese, so the annotation is drawn with
        # PIL. The font is loaded lazily so it is read from disk only once
        # and only when there is something to draw.
        if font is None:
            # Arguments: font file path, font size.
            font = ImageFont.truetype("simhei.ttf", 15, encoding="utf-8")
        draw.text((x, y - 10), text, (0, 0, 0), font=font)
        draw.rectangle([(x, y), (x + w, y + h)], outline="red")

    # The OCR output need not end with an empty entry; without this final
    # commit the last recognised line would be silently lost.
    _commit_line(current, linlist, started, pattern)

    print("过滤前列表", linlist)

    if len(linlist) < 2:
        # Header and question text were not both recognised, so there is
        # nothing to post-process (indexing linlist[1] would raise).
        return []

    linlist[1] = split_num_content(linlist[1])  # strip the question number
    del linlist[0]  # drop the "第N题" header line

    print("答案:", find_question_result(linlist))  # report the answer
    # Merge wrapped lines / drop trailing text around the options.
    filter_select_list(linlist)

    # Annotated copy in OpenCV channel order, kept for the debug preview:
    #   cv2.imshow("recoText", annotated); cv2.waitKey(); cv2.destroyAllWindows()
    annotated = cv2.cvtColor(np.array(pilimg), cv2.COLOR_RGB2BGR)

    print("过滤前列表后", linlist)
    return linlist


if __name__ == '__main__':
    # Sample screenshot to recognise; 'ocr-test.png' and 'result.png' were
    # used as alternative inputs during development.
    image_path = r'tf_question.png'
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # rather than raising, so fail here with a clear message instead of
        # an obscure error inside recoText.
        raise SystemExit("cannot read image: {}".format(image_path))

    data = recoText(img)
    
